In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from __future__ import print_function
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from datetime import datetime
import os

%matplotlib inline
%config InlineBackend.figure_format = 'png'
pd.set_option("max_columns",50)

In [2]:
%%time
train = pd.read_csv("../data/train_2013.csv", index_col=0)
train = train.reset_index(drop=True)
train = train[train["is_booking"] == 1]
np.random.seed(402)
train = train.ix[np.random.choice(train.index, 50000)]
train = train.reset_index(drop=True)


Wall time: 41.9 s

In [3]:
%%time

print('preprocessing train_data')
use_col = ["srch_co","srch_ci","srch_destination_id","hotel_country","srch_adults_cnt","srch_children_cnt","hotel_cluster"]

train_y = train[["hotel_cluster"]]

train_x = train[use_col]
train_x["srch_ci"] = pd.to_datetime(train_x["srch_ci"], errors="coerce")
train_x["srch_co"] = pd.to_datetime(train_x["srch_co"], errors="coerce")
train_x["period"] = train_x["srch_co"] - train_x["srch_ci"]
train_x["period"] = (train_x["period"] / np.timedelta64(1, 'D')).astype(int)
train_x = train_x.drop(["srch_co","srch_ci"], axis=1)
train_x["srch_adults_cnt"] = train_x["srch_adults_cnt"].apply(lambda x: 3 if x>=3 else x)
train_x = train_x.drop(["srch_children_cnt"], axis=1)
train_x = train_x[["srch_destination_id","hotel_country","srch_adults_cnt","period"]]


preprocessing train_data
C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\ipykernel\__main__.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\ipykernel\__main__.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\ipykernel\__main__.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Wall time: 617 ms
C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\ipykernel\__main__.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [6]:
train_x.head()


Out[6]:
srch_destination_id hotel_country srch_adults_cnt period
0 12696 8 1 1
1 12189 50 2 4
2 2758 31 1 8
3 8267 50 3 2
4 18741 50 1 3

In [8]:
train["hotel_cluster"].unique()


Out[8]:
array([15, 72, 58, 56, 42,  0, 96, 18, 95, 91, 11, 43, 12, 46, 26,  5,  2,
       16, 70, 33, 21,  4, 51, 40, 20, 89, 48, 82,  1, 79,  8, 50, 97, 47,
       63, 55, 61, 77,  6, 59,  3, 49, 64, 41, 13, 94, 98, 19, 28, 14, 37,
       62, 10, 36, 74, 80, 44, 31, 32, 29, 45,  7, 87, 99, 35, 68, 57,  9,
       81, 86, 60, 30, 52, 39, 75, 83, 78, 65, 25, 88, 90, 69, 71, 76, 23,
       66, 67, 54, 53, 92, 17, 85, 24, 22, 84, 38, 73, 34, 93, 27], dtype=int64)

In [13]:
train.groupby(['srch_destination_id',
                         'hotel_cluster'])['is_booking'].agg(['sum','count'])

# srch_destination_id = ex) 도쿄라고 했을때 나오는 그룹 => hotel_cluster 고로, srch_destination_id 와 hotel_country는 유사할것임


Out[13]:
sum count
srch_destination_id hotel_cluster
2 20 1 1
4 67 1 1
78 1 1
81 1 1
8 7 1 1
32 1 1
42 1 1
48 1 1
76 1 1
11 91 1 1
14 20 1 1
61 1 1
16 15 1 1
85 1 1
19 64 1 1
21 62 1 1
67 3 3
82 1 1
89 1 1
24 3 1 1
23 1 1
42 1 1
47 2 2
60 1 1
76 2 2
91 3 3
25 5 1 1
10 1 1
13 2 2
32 1 1
... ... ... ...
60988 41 1 1
68 1 1
61097 28 1 1
72 1 1
61102 95 1 1
61128 12 1 1
61193 30 1 1
36 2 2
61306 60 1 1
61413 29 1 1
62 1 1
61418 58 1 1
61442 5 1 1
61528 32 1 1
49 1 1
72 1 1
61531 10 1 1
61533 11 1 1
41 2 2
83 1 1
61702 33 1 1
47 1 1
48 1 1
91 2 2
61756 56 2 2
72 1 1
77 1 1
62487 6 1 1
62508 32 1 1
62824 21 1 1

22857 rows × 2 columns

이게 baseline 앞으론 남들과 다르게 예약한 사람을 찾아서 그들을 지켜보자


In [2]:
%%time
train = pd.read_csv("../data/train_2013.csv", index_col=0)
train = train.reset_index(drop=True)
np.random.seed(402)
train = train.ix[np.random.choice(train.index, 50000)]
train = train.reset_index(drop=True)


print('preprocessing train_data')
use_col = ["srch_co","srch_ci","user_location_region",\
               "hotel_market","srch_destination_id","hotel_country","srch_adults_cnt","srch_children_cnt","hotel_cluster"]

train_y = train[["hotel_cluster"]]

train_x = train[use_col]
train_x["srch_ci"] = pd.to_datetime(train_x["srch_ci"], errors="coerce")
train_x["srch_co"] = pd.to_datetime(train_x["srch_co"], errors="coerce")
train_x["period"] = train_x["srch_co"] - train_x["srch_ci"]
train_x["period"] = (train_x["period"] / np.timedelta64(1, 'D')).fillna(0.0).astype(int)
train_x = train_x.drop(["srch_co","srch_ci"], axis=1)
train_x["srch_adults_cnt"] = train_x["srch_adults_cnt"].apply(lambda x: 3 if x>=3 else x)
train_x = train_x.drop(["srch_children_cnt"], axis=1)
train_x = train_x[["hotel_market","srch_destination_id","hotel_country","srch_adults_cnt","period","user_location_region"]]



use_col = ["srch_co","srch_ci","user_location_region",\
               "hotel_market","srch_destination_id","hotel_country","srch_adults_cnt","srch_children_cnt"]
print("read the test.csv")
test = pd.read_csv("../data/test.csv")
test = test[use_col]

print("preprocessing test_data")

test["srch_ci"] = pd.to_datetime(test["srch_ci"], errors="coerce")
test["srch_co"] = pd.to_datetime(test["srch_co"], errors="coerce")
test["period"] = test["srch_co"] - test["srch_ci"]
test["period"] = (test["period"] / np.timedelta64(1, 'D')).fillna(0.0).astype(int)
test = test.drop(["srch_co","srch_ci"], axis=1)
test["num"] = 1
test["srch_adults_cnt"] = test["srch_adults_cnt"].apply(lambda x: 3 if x>=3 else x)
test = test.drop(["num","srch_children_cnt"], axis=1)

test = test[["hotel_market","srch_destination_id","hotel_country","srch_adults_cnt","period","user_location_region"]]

print("modeling strart")
model = RandomForestClassifier(n_estimators=10, max_depth=7, n_jobs=-1, random_state=777)
print('='*50)
print('# Test shape : {}'.format(test.shape))

model.fit(train_x,train_y)

preds = model.predict_proba(test)
preds = np.fliplr(np.argsort(preds, axis=1))

print("save file")

result_df = pd.DataFrame([ " ".join(row) for row in preds[:,:5].astype(str)], columns=["hotel_cluster"])
result_df.index.names = ["id"]
file_name = datetime.now().strftime("result_%Y%m%d%H%M%S") + '.csv'
result_df.to_csv(os.path.join('../output',file_name), index=True)


preprocessing train_data
C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\ipykernel\__main__.py:14: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\ipykernel\__main__.py:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\ipykernel\__main__.py:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-2-fb4819ba599b> in <module>()
----> 1 get_ipython().run_cell_magic(u'time', u'', u'%%time\ntrain = pd.read_csv("../data/train_2013.csv", index_col=0)\ntrain = train.reset_index(drop=True)\nnp.random.seed(402)\ntrain = train.ix[np.random.choice(train.index, 50000)]\ntrain = train.reset_index(drop=True)\n\n\nprint(\'preprocessing train_data\')\nuse_col = ["srch_co","srch_ci","user_location_region",\\\n               "hotel_market","srch_destination_id","hotel_country","srch_adults_cnt","srch_children_cnt","hotel_cluster"]\n\ntrain_y = train[["hotel_cluster"]]\n\ntrain_x = train[use_col]\ntrain_x["srch_ci"] = pd.to_datetime(train_x["srch_ci"], errors="coerce")\ntrain_x["srch_co"] = pd.to_datetime(train_x["srch_co"], errors="coerce")\ntrain_x["period"] = train_x["srch_co"] - train_x["srch_ci"]\ntrain_x["period"] = (train_x["period"] / np.timedelta64(1, \'D\')).astype(int)\ntrain_x = train_x.drop(["srch_co","srch_ci"], axis=1)\ntrain_x["srch_adults_cnt"] = train_x["srch_adults_cnt"].apply(lambda x: 3 if x>=3 else x)\ntrain_x = train_x.drop(["srch_children_cnt"], axis=1)\ntrain_x = train_x[["hotel_market","srch_destination_id","hotel_country","srch_adults_cnt","period","user_location_region"]]\n\n\n\nuse_col = ["srch_co","srch_ci","user_location_region",\\\n               "hotel_market","srch_destination_id","hotel_country","srch_adults_cnt","srch_children_cnt"]\nprint("read the test.csv")\ntest = pd.read_csv("../data/test.csv")\ntest = test[use_col]\n\nprint("preprocessing test_data")\n\ntest["srch_ci"] = pd.to_datetime(test["srch_ci"], errors="coerce")\ntest["srch_co"] = pd.to_datetime(test["srch_co"], errors="coerce")\ntest["period"] = test["srch_co"] - test["srch_ci"]\ntest["period"] = (test["period"] / np.timedelta64(1, \'D\')).fillna(0.0).astype(int)\ntest = test.drop(["srch_co","srch_ci"], axis=1)\ntest["num"] = 1\ntest["srch_adults_cnt"] = test["srch_adults_cnt"].apply(lambda x: 3 if x>=3 else x)\ntest = test.drop(["num","srch_children_cnt"], axis=1)\n\ntest = test[["hotel_market","srch_destination_id","hotel_country","srch_adults_cnt","period","user_location_region"]]\n\nprint("modeling strart")\nmodel = RandomForestClassifier(n_estimators=10, max_depth=7, n_jobs=-1, random_state=777)\nprint(\'=\'*50)\nprint(\'# Test shape : {}\'.format(test.shape))\n\nmodel.fit(train_x,train_y)\n\npreds = model.predict_proba(test)\npreds = np.fliplr(np.argsort(preds, axis=1))\n\nprint("save file")\n\nresult_df = pd.DataFrame([ " ".join(row) for row in preds[:,:5].astype(str)], columns=["hotel_cluster"])\nresult_df.index.names = ["id"]\nfile_name = datetime.now().strftime("result_%Y%m%d%H%M%S") + \'.csv\'\nresult_df.to_csv(os.path.join(\'../output\',file_name), index=True)')

C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\IPython\core\interactiveshell.pyc in run_cell_magic(self, magic_name, line, cell)
   2113             magic_arg_s = self.var_expand(line, stack_depth)
   2114             with self.builtin_trap:
-> 2115                 result = fn(magic_arg_s, cell)
   2116             return result
   2117 

<decorator-gen-60> in time(self, line, cell, local_ns)

C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\IPython\core\magic.pyc in <lambda>(f, *a, **k)
    186     # but it's overkill for just that one bit of state.
    187     def magic_deco(arg):
--> 188         call = lambda f, *a, **k: f(*a, **k)
    189 
    190         if callable(arg):

C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\IPython\core\magics\execution.pyc in time(self, line, cell, local_ns)
   1174         if mode=='eval':
   1175             st = clock2()
-> 1176             out = eval(code, glob, local_ns)
   1177             end = clock2()
   1178         else:

<timed eval> in <module>()

C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\IPython\core\interactiveshell.pyc in run_cell_magic(self, magic_name, line, cell)
   2113             magic_arg_s = self.var_expand(line, stack_depth)
   2114             with self.builtin_trap:
-> 2115                 result = fn(magic_arg_s, cell)
   2116             return result
   2117 

<decorator-gen-60> in time(self, line, cell, local_ns)

C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\IPython\core\magic.pyc in <lambda>(f, *a, **k)
    186     # but it's overkill for just that one bit of state.
    187     def magic_deco(arg):
--> 188         call = lambda f, *a, **k: f(*a, **k)
    189 
    190         if callable(arg):

C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\IPython\core\magics\execution.pyc in time(self, line, cell, local_ns)
   1178         else:
   1179             st = clock2()
-> 1180             exec(code, glob, local_ns)
   1181             end = clock2()
   1182             out = None

<timed exec> in <module>()

C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\pandas\core\generic.pyc in astype(self, dtype, copy, raise_on_error, **kwargs)
   2948 
   2949         mgr = self._data.astype(dtype=dtype, copy=copy,
-> 2950                                 raise_on_error=raise_on_error, **kwargs)
   2951         return self._constructor(mgr).__finalize__(self)
   2952 

C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\pandas\core\internals.pyc in astype(self, dtype, **kwargs)
   2936 
   2937     def astype(self, dtype, **kwargs):
-> 2938         return self.apply('astype', dtype=dtype, **kwargs)
   2939 
   2940     def convert(self, **kwargs):

C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\pandas\core\internals.pyc in apply(self, f, axes, filter, do_integrity_check, consolidate, raw, **kwargs)
   2888 
   2889             kwargs['mgr'] = self
-> 2890             applied = getattr(b, f)(**kwargs)
   2891             result_blocks = _extend_blocks(applied, result_blocks)
   2892 

C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\pandas\core\internals.pyc in astype(self, dtype, copy, raise_on_error, values, **kwargs)
    432                **kwargs):
    433         return self._astype(dtype, copy=copy, raise_on_error=raise_on_error,
--> 434                             values=values, **kwargs)
    435 
    436     def _astype(self, dtype, copy=False, raise_on_error=True, values=None,

C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\pandas\core\internals.pyc in _astype(self, dtype, copy, raise_on_error, values, klass, mgr, **kwargs)
    475 
    476                 # _astype_nansafe works fine with 1-d only
--> 477                 values = com._astype_nansafe(values.ravel(), dtype, copy=True)
    478                 values = values.reshape(self.shape)
    479 

C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\pandas\core\common.pyc in _astype_nansafe(arr, dtype, copy)
   1912 
   1913         if np.isnan(arr).any():
-> 1914             raise ValueError('Cannot convert NA to integer')
   1915     elif arr.dtype == np.object_ and np.issubdtype(dtype.type, np.integer):
   1916         # work around NumPy brokenness, #1987

ValueError: Cannot convert NA to integer

In [ ]: